# Load the retail data from the workbook "online-retail.xlsx".
mba_data <- read_excel("online-retail.xlsx")
# Reshape the data with mbar_prep_data(), passing the InvoiceNo and Description
# columns. The printed preview below shows only character columns named
# item_1, item_2, ..., padded with "" where there is no item.
# NOTE(review): presumably one row per invoice with its items spread across the
# columns -- confirm against the mbar_prep_data() documentation.
transactions <- mbar_prep_data(mba_data, InvoiceNo, Description)
# Preview the first rows of the prepared table.
head(transactions)
## # A tibble: 6 x 1,114
## item_1 item_2 item_3 item_4 item_5 item_6 item_7 item_8 item_9 item_10
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 BATH ~ "" "" "" "" "" "" "" "" ""
## 2 PAPER~ "" "" "" "" "" "" "" "" ""
## 3 VICTO~ "" "" "" "" "" "" "" "" ""
## 4 JAM M~ "" "" "" "" "" "" "" "" ""
## 5 Disco~ "" "" "" "" "" "" "" "" ""
## 6 SET O~ "" "" "" "" "" "" "" "" ""
## # ... with 1,104 more variables: item_11 <chr>, item_12 <chr>,
## # item_13 <chr>, item_14 <chr>, item_15 <chr>, item_16 <chr>,
## # item_17 <chr>, item_18 <chr>, item_19 <chr>, item_20 <chr>,
## # item_21 <chr>, item_22 <chr>, item_23 <chr>, item_24 <chr>,
## # item_25 <chr>, item_26 <chr>, item_27 <chr>, item_28 <chr>,
## # item_29 <chr>, item_30 <chr>, item_31 <chr>, item_32 <chr>,
## # item_33 <chr>, item_34 <chr>, item_35 <chr>, item_36 <chr>,
## # item_37 <chr>, item_38 <chr>, item_39 <chr>, item_40 <chr>,
## # item_41 <chr>, item_42 <chr>, item_43 <chr>, item_44 <chr>,
## # item_45 <chr>, item_46 <chr>, item_47 <chr>, item_48 <chr>,
## # item_49 <chr>, item_50 <chr>, item_51 <chr>, item_52 <chr>,
## # item_53 <chr>, item_54 <chr>, item_55 <chr>, item_56 <chr>,
## # item_57 <chr>, item_58 <chr>, item_59 <chr>, item_60 <chr>,
## # item_61 <chr>, item_62 <chr>, item_63 <chr>, item_64 <chr>,
## # item_65 <chr>, item_66 <chr>, item_67 <chr>, item_68 <chr>,
## # item_69 <chr>, item_70 <chr>, item_71 <chr>, item_72 <chr>,
## # item_73 <chr>, item_74 <chr>, item_75 <chr>, item_76 <chr>,
## # item_77 <chr>, item_78 <chr>, item_79 <chr>, item_80 <chr>,
## # item_81 <chr>, item_82 <chr>, item_83 <chr>, item_84 <chr>,
## # item_85 <chr>, item_86 <chr>, item_87 <chr>, item_88 <chr>,
## # item_89 <chr>, item_90 <chr>, item_91 <chr>, item_92 <chr>,
## # item_93 <chr>, item_94 <chr>, item_95 <chr>, item_96 <chr>,
## # item_97 <chr>, item_98 <chr>, item_99 <chr>, item_100 <chr>,
## # item_101 <chr>, item_102 <chr>, item_103 <chr>, item_104 <chr>,
## # item_105 <chr>, item_106 <chr>, item_107 <chr>, item_108 <chr>,
## # item_109 <chr>, item_110 <chr>, ...
## [1] 20.92313
## [1] 10
## # A tibble: 4,212 x 2
## Description count
## <chr> <int>
## 1 WHITE HANGING HEART T-LIGHT HOLDER 2369
## 2 REGENCY CAKESTAND 3 TIER 2200
## 3 JUMBO BAG RED RETROSPOT 2159
## 4 PARTY BUNTING 1727
## 5 LUNCH BAG RED RETROSPOT 1638
## 6 ASSORTED COLOUR BIRD ORNAMENT 1501
## 7 SET OF 3 CAKE TINS PANTRY DESIGN 1473
## 8 <NA> 1454
## 9 PACK OF 72 RETROSPOT CAKE CASES 1385
## 10 LUNCH BAG BLACK SKULL. 1350
## # ... with 4,202 more rows
# Average order value: the total of all unit prices divided by the number of
# distinct invoices.
# NOTE(review): revenue is taken as the sum of UnitPrice alone; if the data also
# carries a per-line quantity, this understates revenue -- confirm intent.

# Per-invoice totals, computed once and reused for both figures below
# (previously the same group_by(InvoiceNo) pass was run twice).
order_totals <-
  mba_data %>%
  group_by(InvoiceNo) %>%
  summarize(order_sum = sum(UnitPrice))

total_revenue <- sum(order_totals$order_sum)

# order_totals has one row per invoice, so its row count is the number of
# transactions.
total_transactions <- nrow(order_totals)

total_revenue / total_transactions
## [1] 96.47892
## transactions in sparse format with
## 25901 transactions (rows) and
## 10085 items (columns)
## transactions as itemMatrix in sparse format with
## 25901 rows (elements/itemsets/transactions) and
## 10085 columns (items) and a density of 0.001660018
##
## most frequent items:
## WHITE HANGING HEART T-LIGHT HOLDER REGENCY CAKESTAND 3 TIER
## 1999 1914
## JUMBO BAG RED RETROSPOT PARTY BUNTING
## 1806 1488
## LUNCH BAG RED RETROSPOT (Other)
## 1404 425005
##
## element (itemset/transaction) length distribution:
## sizes
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 1454 4578 1727 1208 942 891 781 715 696 683 612 642 547 530 543
## 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
## 555 537 479 459 491 428 405 328 311 280 248 261 235 221 233
## 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
## 224 175 174 145 149 139 122 119 100 117 98 94 102 93 72
## 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
## 73 74 71 69 68 59 70 49 49 54 57 42 32 42 39
## 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
## 34 40 22 27 30 24 34 28 25 21 23 26 14 17 24
## 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89
## 11 18 14 13 10 16 18 15 10 9 16 13 16 13 7
## 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
## 8 12 12 8 7 7 4 7 9 5 8 8 4 5 7
## 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
## 2 3 7 9 4 7 4 2 7 1 1 4 7 6 2
## 120 121 122 123 124 125 126 127 129 130 131 132 133 134 135
## 3 5 4 4 2 5 6 2 1 4 3 6 6 3 4
## 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150
## 3 2 1 1 3 8 5 3 4 4 6 2 3 1 4
## 151 152 153 154 155 156 157 158 159 160 162 163 164 167 168
## 3 2 4 7 3 3 5 2 4 5 1 2 1 3 5
## 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183
## 2 2 4 3 1 3 5 1 2 2 2 2 1 2 1
## 184 185 186 187 189 190 192 193 194 196 197 198 201 202 204
## 2 1 1 2 2 1 1 5 1 2 3 2 1 1 2
## 205 206 207 208 209 212 213 215 219 220 224 226 227 228 230
## 2 1 3 3 2 1 2 2 7 1 3 3 1 1 2
## 232 234 236 238 240 241 244 248 249 250 252 256 257 258 260
## 1 2 1 2 2 2 1 1 2 2 1 1 1 1 2
## 261 263 265 266 270 272 281 284 285 298 299 301 303 304 305
## 1 2 1 1 1 1 1 1 2 1 2 1 1 1 3
## 312 314 316 320 321 326 327 329 332 333 338 339 341 344 348
## 2 1 1 2 1 1 1 1 1 1 1 1 1 2 1
## 350 360 365 367 375 391 394 398 400 402 405 411 419 422 429
## 1 2 1 1 3 1 1 1 1 1 1 1 2 1 1
## 431 442 447 460 468 471 477 509 514 530 587 627 1114
## 2 1 1 1 1 1 1 1 1 1 1 1 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 2.00 8.00 16.74 20.00 1114.00
##
## includes extended item information - examples:
## labels
## 1 *Boombox Ipod Classic
## 2 *USB Office Mirror Ball
## 3 ?
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.009 1
## maxlen target ext
## 4 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 233
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[10085 item(s), 25901 transaction(s)] done [2.75s].
## sorting and recoding items ... [508 item(s)] done [0.06s].
## creating transaction tree ... done [0.30s].
## checking subsets of size 1 2 3 4
## Warning in apriori(basket_data, parameter = list(supp = 0.009, conf =
## 0.8, : Mining stopped (maxlen reached). Only patterns up to a length of 4
## returned!
## done [0.21s].
## writing ... [22 rule(s)] done [0.00s].
## creating S4 object ... done [0.05s].
## set of 22 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4
## 11 9 2
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.000 2.500 2.591 3.000 4.000
##
## summary of quality measures:
## support confidence lift count
## Min. :0.009034 Min. :0.8035 Min. :22.59 Min. :234.0
## 1st Qu.:0.010453 1st Qu.:0.8530 1st Qu.:25.02 1st Qu.:270.8
## Median :0.013223 Median :0.8868 Median :55.94 Median :342.5
## Mean :0.012760 Mean :0.9120 Mean :48.55 Mean :330.5
## 3rd Qu.:0.014362 3rd Qu.:1.0000 3rd Qu.:61.23 3rd Qu.:372.0
## Max. :0.018339 Max. :1.0000 Max. :71.30 Max. :475.0
##
## mining info:
## data ntransactions support confidence
## basket_data 25901 0.009 0.8
## lhs rhs support confidence lift count
## [1] {BACK DOOR} => {KEY FOB} 0.009613528 1.0000000 61.23168 249
## [2] {SET 3 RETROSPOT TEA} => {SUGAR} 0.014362380 1.0000000 69.62634 372
## [3] {SUGAR} => {SET 3 RETROSPOT TEA} 0.014362380 1.0000000 69.62634 372
## [4] {SET 3 RETROSPOT TEA} => {COFFEE} 0.014362380 1.0000000 55.94168 372
## [5] {SUGAR} => {COFFEE} 0.014362380 1.0000000 55.94168 372
## [6] {SHED} => {KEY FOB} 0.011273696 1.0000000 61.23168 292
## [7] {SET 3 RETROSPOT TEA,
## SUGAR} => {COFFEE} 0.014362380 1.0000000 55.94168 372
## [8] {COFFEE,
## SET 3 RETROSPOT TEA} => {SUGAR} 0.014362380 1.0000000 69.62634 372
## [9] {COFFEE,
## SUGAR} => {SET 3 RETROSPOT TEA} 0.014362380 1.0000000 69.62634 372
## [10] {PINK REGENCY TEACUP AND SAUCER,
## REGENCY CAKESTAND 3 TIER,
## ROSES REGENCY TEACUP AND SAUCER} => {GREEN REGENCY TEACUP AND SAUCER} 0.009999614 0.8900344 25.16679 259
## {SET 3 RETROSPOT TEA,SUGAR}
## 3
## {SET 3 RETROSPOT TEA,SUGAR}
## 4
## {COFFEE,SET 3 RETROSPOT TEA}
## 5
## {COFFEE,SET 3 RETROSPOT TEA}
## 6
## {COFFEE,SUGAR}
## 7
## {COFFEE,SUGAR}
## 8
## {COFFEE,SET 3 RETROSPOT TEA,SUGAR}
## 12
## {COFFEE,SET 3 RETROSPOT TEA,SUGAR}
## 13
## {COFFEE,SET 3 RETROSPOT TEA,SUGAR}
## 14
## {SET/20 RED RETROSPOT PAPER NAPKINS,SET/6 RED SPOTTY PAPER CUPS,SET/6 RED SPOTTY PAPER PLATES}
## 15
## {GREEN REGENCY TEACUP AND SAUCER,PINK REGENCY TEACUP AND SAUCER,ROSES REGENCY TEACUP AND SAUCER}
## 16
## {GREEN REGENCY TEACUP AND SAUCER,PINK REGENCY TEACUP AND SAUCER,ROSES REGENCY TEACUP AND SAUCER}
## 17
## {GREEN REGENCY TEACUP AND SAUCER,PINK REGENCY TEACUP AND SAUCER,REGENCY CAKESTAND 3 TIER,ROSES REGENCY TEACUP AND SAUCER}
## 21
## {GREEN REGENCY TEACUP AND SAUCER,PINK REGENCY TEACUP AND SAUCER,REGENCY CAKESTAND 3 TIER,ROSES REGENCY TEACUP AND SAUCER}
## 22
## lhs rhs support confidence lift count
## [1] {BACK DOOR} => {KEY FOB} 0.009613528 1.0000000 61.23168 249
## [2] {SET 3 RETROSPOT TEA} => {SUGAR} 0.014362380 1.0000000 69.62634 372
## [3] {COFFEE,
## SUGAR} => {SET 3 RETROSPOT TEA} 0.014362380 1.0000000 69.62634 372
## [4] {PINK REGENCY TEACUP AND SAUCER,
## REGENCY CAKESTAND 3 TIER,
## ROSES REGENCY TEACUP AND SAUCER} => {GREEN REGENCY TEACUP AND SAUCER} 0.009999614 0.8900344 25.16679 259
## [5] {SET/20 RED RETROSPOT PAPER NAPKINS,
## SET/6 RED SPOTTY PAPER CUPS} => {SET/6 RED SPOTTY PAPER PLATES} 0.009111617 0.8872180 48.68609 236
## [6] {REGENCY TEA PLATE GREEN} => {REGENCY TEA PLATE ROSES} 0.010347091 0.8322981 55.99313 268
## [7] {STRAWBERRY CHARLOTTE BAG,
## WOODLAND CHARLOTTE BAG} => {RED RETROSPOT CHARLOTTE BAG} 0.010771785 0.8110465 23.65644 279
## [8] {SET/6 RED SPOTTY PAPER CUPS} => {SET/6 RED SPOTTY PAPER PLATES} 0.012084476 0.8087855 44.38211 313
# Mine rules whose consequent (rhs) is restricted to "SUGAR"; all other items
# may appear only in the antecedent (lhs). Parameter names are spelled out in
# full and FALSE is used instead of the reassignable shorthand F.
sugar_rules <- apriori(
  basket_data,
  parameter = list(support = 0.009, confidence = 0.8),
  appearance = list(default = "lhs", rhs = "SUGAR"),
  control = list(verbose = FALSE)
)
# Order the rules from highest to lowest confidence before printing them.
rules_sugar <- sort(sugar_rules, by = "confidence", decreasing = TRUE)
inspect(rules_sugar)
## lhs rhs support confidence lift
## [1] {SET 3 RETROSPOT TEA} => {SUGAR} 0.01436238 1.0000000 69.62634
## [2] {COFFEE,SET 3 RETROSPOT TEA} => {SUGAR} 0.01436238 1.0000000 69.62634
## [3] {COFFEE} => {SUGAR} 0.01436238 0.8034557 55.94168
## count
## [1] 372
## [2] 372
## [3] 372
# Mine rules whose antecedent (lhs) is restricted to "SUGAR"; all other items
# may appear only in the consequent (rhs). Parameter names are spelled out in
# full and FALSE is used instead of the reassignable shorthand F.
sugar_rules <- apriori(
  basket_data,
  parameter = list(support = 0.009, confidence = 0.8),
  appearance = list(default = "rhs", lhs = "SUGAR"),
  control = list(verbose = FALSE)
)
# Order the rules from highest to lowest confidence before printing them.
rules_sugar <- sort(sugar_rules, by = "confidence", decreasing = TRUE)
inspect(rules_sugar)
## lhs rhs support confidence lift count
## [1] {SUGAR} => {SET 3 RETROSPOT TEA} 0.01436238 1 69.62634 372
## [2] {SUGAR} => {COFFEE} 0.01436238 1 55.94168 372
## lhs rhs support confidence lift count
## [1] {BACK DOOR} => {KEY FOB} 0.009613528 1.0000000 61.23168 249
## [2] {SET 3 RETROSPOT TEA} => {SUGAR} 0.014362380 1.0000000 69.62634 372
## [3] {SUGAR} => {SET 3 RETROSPOT TEA} 0.014362380 1.0000000 69.62634 372
## [4] {SET 3 RETROSPOT TEA} => {COFFEE} 0.014362380 1.0000000 55.94168 372
## [5] {SUGAR} => {COFFEE} 0.014362380 1.0000000 55.94168 372
## [6] {SHED} => {KEY FOB} 0.011273696 1.0000000 61.23168 292
## [7] {SET 3 RETROSPOT TEA,
## SUGAR} => {COFFEE} 0.014362380 1.0000000 55.94168 372
## [8] {COFFEE,
## SET 3 RETROSPOT TEA} => {SUGAR} 0.014362380 1.0000000 69.62634 372
## [9] {COFFEE,
## SUGAR} => {SET 3 RETROSPOT TEA} 0.014362380 1.0000000 69.62634 372
## [10] {PINK REGENCY TEACUP AND SAUCER,
## REGENCY CAKESTAND 3 TIER,
## ROSES REGENCY TEACUP AND SAUCER} => {GREEN REGENCY TEACUP AND SAUCER} 0.009999614 0.8900344 25.16679 259
WHAT
An unsupervised data mining technique that retailers use to understand purchase behavior and to determine which items are frequently bought together.
WHY
Store layout, online recommendation engines, targeted marketing campaigns, cross-selling, up-selling, catalogue design, enhancing the customer experience, etc.
ADVANTAGES
cost-effective, insightful, flexible and actionable
USAGE
retail, telecommunications, banks, insurance, medical, manufacturing
WARNING